# Read the student-performance data (semicolon-separated) and convert to tibbles.
# Two copies are kept: `student` (zero final grades will be removed later) and
# `student_all` (keeps every student).
student <- read.table("student-mat.csv", sep = ";", header = TRUE)
student <- as_tibble(student)
student_all <- read.table("student-mat.csv", sep = ";", header = TRUE)
# BUG FIX: previously converted `student` again, leaving student_all a plain data.frame
student_all <- as_tibble(student_all)
The initial covariates in the dataset are:
And the response variables are:
For this analysis, we will be considering the G3 final grade as our response.
# Histogram of the final grade (G3) with the sample mean marked in red
ggplot(student, aes(x = G3)) +
  geom_histogram(binwidth = 1, color = "black", fill = "grey") +
  geom_vline(xintercept = mean(student$G3), lwd = 1, color = "red") +
  labs(title = "Distribution of Final Math Grade", x = "Final Grade", y = "Count")
# Sanity check: no student scored 0 in all three grading periods
student %>% select(G1, G2, G3) %>% filter(G1 == 0, G2 == 0, G3 == 0)
## # A tibble: 0 x 3
## # … with 3 variables: G1 <int>, G2 <int>, G3 <int>
Let’s remove the students with a grade of 0, as that doesn’t make much practical sense, especially since all of these students have grades higher than 0 first and/or second period.
**BB: agree with this approach, if we are feeling adventurous we could treat those 0's as missing and impute them but let's not over complicate things yet # MICE code if we decide to impute ## student <- read.table("student-mat.csv", sep=";", header=T) student <- as.data.frame(student)
student$G3[student$G3 == 0] <- NA md.pattern(as.data.frame(student)) #38 students missing G3, almost 10% of data is missing imputed_data <- mice(student, m=5, maxit = 50, method = "pmm", seed = 500) summary(imputed_data) imputed_data$imp$G3 imputed<-complete(imputed_data)
# Note: ggcorr warns about non-numeric variables but still plots all numeric ones.
# G1/G2 are highly correlated with G3, which further justifies modelling G3 alone.
ggcorr(student)
# Drop the students with a final grade of 0, and remove the period grades G1/G2
student <- student %>%
  filter(G3 != 0) %>%
  select(-G1, -G2)
student_all <- student_all %>%
  select(-G1, -G2)
# Re-plot the grade distribution after dropping the zero final grades
ggplot(student, aes(x = G3)) +
  geom_histogram(binwidth = 1, color = "black", fill = "grey") +
  geom_vline(xintercept = mean(student$G3), lwd = 1, color = "red") +
  labs(title = "Distribution of Final Math Grade", x = "Final Grade", y = "Count")
# BB: I don't see any real differences
# Correlation heat map after removing G1/G2
ggcorr(student)
# One facet per covariate: boxplots of G3 against each variable's values.
# (gather is superseded by pivot_longer, but is kept here because it silently
# coerces mixed column types into one "Value" column.)
student %>%
  gather(key = "Variable", value = "Value", -G3) %>%
  ggplot(aes(x = Value, y = G3)) +
  geom_boxplot() +
  facet_wrap(~ Variable, scales = "free")
## Visualizing absences: one boxplot per absence count, then a jittered
## scatter plot with a loess smoother
student %>%
  mutate(absences = factor(absences)) %>%
  ggplot(aes(x = absences, y = G3)) +
  geom_boxplot()
student %>%
  ggplot(aes(absences, G3)) +
  geom_jitter() +
  geom_smooth()
Since the variable “absences” has a greater range of values than most of the other variables, and since after 22 absences the data becomes extremely sparse, we decided to group 23-93 absences within the value “23.”
## Pool all absences greater than 22 into the value 23 — the data are extremely
## sparse beyond 22 absences. pmin() is an exact vectorized replacement for the
## original case_when(absences > 22 ~ 23L, TRUE ~ absences), including NA handling.
student <- student %>%
  mutate(absences = pmin(absences, 23L))
## Visualizing the pooled absences variable (23 = 23 or more absences)
student %>%
  mutate(absences = factor(absences)) %>%
  ggplot(aes(x = absences, y = G3)) +
  geom_boxplot()
student %>%
  ggplot(aes(absences, G3)) +
  geom_jitter() +
  geom_smooth()
“Absences” still has a much greater spread of values than the other variables, so we will try binning them to see how that affects our models.
## Bin absences into 5 groups of 5 absence values each (the last group holds
## everything >= 20). findInterval maps x < 5 -> 0, 5 <= x < 10 -> 1, ... so
## adding 1L reproduces the original case_when exactly (NA stays NA).
bin_absences <- function(x) {
  findInterval(x, c(5L, 10L, 15L, 20L)) + 1L
}
# Apply the same binning to both datasets (previously duplicated case_when blocks)
student <- student %>%
  mutate(absences = bin_absences(absences))
student_all <- student_all %>%
  mutate(absences = bin_absences(absences))
## Visualizing the binned absences variable (groups 1-5)
student %>%
  mutate(absences = factor(absences)) %>%
  ggplot(aes(x = absences, y = G3)) +
  geom_boxplot()
student %>%
  ggplot(aes(absences, G3)) +
  geom_jitter() +
  geom_smooth()
## Visualizing alcohol consumption (1-5 scale, 5 is highest):
## violin plots with jittered observations overlaid
# Weekend alcohol consumption
student %>%
  mutate(Walc = factor(Walc)) %>%
  ggplot(aes(x = Walc, y = G3)) +
  geom_violin() +
  geom_jitter(width = 0.2, alpha = 0.5)
# Workday alcohol consumption
student %>%
  mutate(Dalc = factor(Dalc)) %>%
  ggplot(aes(x = Dalc, y = G3)) +
  geom_violin() +
  geom_jitter(width = 0.2, alpha = 0.5)
## Time spent studying (ordinal scale) vs final grade
student %>%
  mutate(studytime = factor(studytime)) %>%
  ggplot(aes(x = studytime, y = G3)) +
  geom_violin() +
  geom_jitter(width = 0.2, alpha = 0.5)
## Time spent travelling (ordinal scale) vs final grade
student %>%
  mutate(traveltime = factor(traveltime)) %>%
  ggplot(aes(x = traveltime, y = G3)) +
  geom_violin() +
  geom_jitter(width = 0.2, alpha = 0.5)
## Guardian (mother / father / other) vs final grade
student %>%
  mutate(guardian = factor(guardian)) %>%
  ggplot(aes(x = guardian, y = G3)) +
  geom_violin() +
  geom_jitter(width = 0.2, alpha = 0.5)
Based on these plots, we decided to remove the variable “guardian” from our dataset since it didn’t seem to have any influence on the outcome.
# Guardian showed no apparent relationship with G3, so drop it from both datasets
student <- student %>% select(-guardian)
student_all <- student_all %>% select(-guardian)
Just to get an initial sense of influential variables we can look at all variables up to this point and examine fit statistics. This bit of code will look at permutations of all possible models up to nvmax and then report the best model based on fit criteria. For example, with nvmax of 30 it will produce the best model for 1 covariate, 2 covariates…up to the full model.
# Based on earlier runs the best models fall within 5-20 variables, so only 20
# variables are considered here. regsubsets finds the best model of each size.
best_subset <- leaps::regsubsets(G3 ~ ., student, nvmax = 20, method = "exhaustive")
results <- summary(best_subset)
# One row per model size; plot each fit criterion in its own facet.
# (seq_along keeps the x-axis in sync with nvmax; pivot_longer replaces the
# superseded gather; show.legend spelled out as FALSE, never F.)
tibble(predictors = seq_along(results$adjr2),
       adj_R2 = results$adjr2,
       Cp = results$cp,
       BIC = results$bic) %>%
  pivot_longer(-predictors, names_to = "statistic", values_to = "value") %>%
  ggplot(aes(predictors, value, color = statistic)) +
  geom_line(show.legend = FALSE) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~ statistic, scales = "free")
#which.max(results$adjr2)
# 17 variables
#which.min(results$bic)
# 8 variables
#which.min(results$cp)
# 14 variables
# Coefficients of the best 8-variable model (the size chosen by BIC)
coef(best_subset, 8)
## (Intercept) Mjobhealth Mjobservices Fjobteacher failures
## 14.6123286 1.7919419 1.4684797 2.0295712 -1.1000624
## schoolsupyes goout health absences
## -2.3415181 -0.4729677 -0.2581331 -0.4284993
# Coefficients of the best 7-variable model, for comparison with the 8-variable fit
coef(best_subset, 7)
## (Intercept) Mjobhealth Mjobservices Fjobteacher failures
## 13.6819891 1.6852313 1.3837170 2.0288848 -1.1282540
## schoolsupyes goout absences
## -2.3052617 -0.4655962 -0.4141869
# We lose health when we drop to 7 variables. Otherwise all variables remain the same and are very similar in their effect size.
VIF and Tolerance
# Build a numeric design matrix (factors dummy-coded) and drop the intercept column
covariates_int <- model.matrix(G3 ~ ., data = student)
covariates <- covariates_int %>% as.data.frame() %>% select(-`(Intercept)`)
student_d <- data.frame(G3 = student$G3, covariates)
# Full main-effects model on the dummy-coded data
model1 <- lm(G3 ~ ., data = student_d)
# Variance Inflation Factor and Tolerance (= 1/VIF) for every covariate
# (assignments switched from `=` to the idiomatic `<-`)
VIF <- car::vif(model1)
Tol <- 1 / VIF
df <- data.frame(Variable = names(VIF), VIF = VIF, Tolerance = Tol)
# Show the ten covariates with the highest VIF
df %>% arrange(desc(VIF)) %>% head(10) %>% knitr::kable(align = c("c", "c"))
| Variable | VIF | Tolerance |
|---|---|---|
| Fjobother | 6.361461 | 0.1571966 |
| Fjobservices | 5.576098 | 0.1793369 |
| Mjobteacher | 3.294857 | 0.3035033 |
| Mjobservices | 3.087159 | 0.3239224 |
| Mjobother | 2.919526 | 0.3425213 |
| Medu | 2.800062 | 0.3571349 |
| Fjobteacher | 2.778815 | 0.3598657 |
| Walc | 2.455796 | 0.4071999 |
| Mjobhealth | 2.423713 | 0.4125901 |
| Fjobhealth | 2.277646 | 0.4390498 |
# Build the matching dummy-coded data frame for the all-students dataset
covariates_int_all <- model.matrix(G3 ~ ., data = student_all)
covariates_all <- covariates_int_all %>%
  as.data.frame() %>%
  select(-`(Intercept)`)
student_d_all <- data.frame(G3 = student_all$G3, covariates_all)
Displayed above are the covariates with the highest Variance Inflation Factors.
EigenAnalysis for the Correlation and Scaled SSCP Matrices
# Scaled SSCP matrix: X'X rescaled so its diagonal is 1 (intercept column included).
# Assignments switched from `=` to the idiomatic `<-`; computation unchanged.
xtx <- t(covariates_int) %*% covariates_int
Ds_half <- diag(diag(xtx)^-0.5)
sscp <- Ds_half %*% xtx %*% Ds_half
eig_sscp <- eigen(sscp)$values
PCs_sscp <- prcomp(sscp)[2]
# Condition index: sqrt(largest eigenvalue / each eigenvalue)
CI_sscp <- sqrt(eig_sscp[1] / eig_sscp)
# Correlation matrix of the centered covariates (intercept excluded)
cov_center <- apply(covariates, 2, function(y) y - mean(y))
C <- (t(cov_center) %*% cov_center) / dim(cov_center)[1]
Dc_half <- diag(diag(C)^-0.5)
R <- Dc_half %*% C %*% Dc_half
eig_corr <- eigen(R)$values
CI_corr <- sqrt(eig_corr[1] / eig_corr)
PCs_corr <- prcomp(R)[2]
# Eigenvalues and condition indices for each matrix
df <- data.frame("Eigenvalue" = c(eig_corr), "Condition Index" = c(CI_corr))
df2 <- data.frame("Eigenvalue" = c(eig_sscp), "Condition Index" = c(CI_sscp))
# Two smallest eigenvalues (largest condition indices) of the correlation matrix
knitr::kable(tail(df, 2), align = c("c", "c"))
| Eigenvalue | Condition.Index | |
|---|---|---|
| 36 | 0.1041715 | 5.642819 |
| 37 | 0.0685539 | 6.955916 |
# Scaled-SSCP dimensions whose condition index exceeds the common cutoff of 30
knitr::kable(filter(df2, Condition.Index > 30), align = c("c", "c"))
| Eigenvalue | Condition.Index |
|---|---|
| 0.0136049 | 39.61935 |
| 0.0015105 | 118.90222 |
From the Correlation Matrix, the highest Condition Index is still very small, which means no collinearity needs to be taken care of there. From the Scaled SSCP Matrix, there are two Condition Indices over 30.
# Loadings of the last two principal components of the scaled SSCP matrix,
# shown for the first 17 covariates (the remaining loadings are near zero)
PCs_sscp$rotation[1:17,37:38]
## PC37 PC38
## [1,] -0.3618352857 -0.780147743
## [2,] -0.0583779551 0.045804345
## [3,] -0.0003378973 0.003672985
## [4,] 0.7739347801 0.043089256
## [5,] -0.0097099818 0.025486111
## [6,] -0.0159420047 0.020592838
## [7,] -0.0154425398 0.029105089
## [8,] 0.2177425660 -0.223620008
## [9,] -0.0290724347 0.039995199
## [10,] -0.0895722284 0.100756307
## [11,] -0.1160828807 0.137012164
## [12,] -0.1177675539 0.135309188
## [13,] -0.1228980639 0.136175538
## [14,] -0.0892427816 0.121509903
## [15,] -0.2922080273 0.384335810
## [16,] -0.1975498340 0.265454245
## [17,] -0.1237038792 0.156172844
Above shows the last 2 PC’s of the Scaled SSCP Matrix for the 1st-17th covariates (the rest have values close to 0). We can see that the 8th and 11-17th covariates may be collinear with the intercept. These covariates are listed below.
# Names of the covariates flagged as possibly collinear with the intercept
colnames(student_d)[c(8, 11:17)]
## [1] "Medu" "Mjobother" "Mjobservices" "Mjobteacher"
## [5] "Fjobhealth" "Fjobother" "Fjobservices" "Fjobteacher"
Many of these are factor variables that have reference values in the intercept, so it makes sense they might be collinear. Not going to remove any.
There are many types of models to consider when building a regression model, such as Multiple Linear Regression, ANOVA, Linear Mixed Models, and Logistic/Poisson Regression. Classically, Linear Mixed Models are used when observations are related, Logistic Regression is used for binary outcomes, and Poisson Regression for count outcomes. Since we are assuming our observations (the students) are independent for our purpose, and the outcome variable (final grade) is numeric, we narrow our model search down to Multiple Linear Regression and ANOVA. Since, at least to start, we have many factor-type covariates, we begin exploring Multiple Linear Regression models and their fit to our data.
*BB: since categorical vars are being grouped into several vars we have more than 30 Betas. I count 39 below.
The first model we consider is a full model of all covariates possible and no interaction terms included. The model can be written as follows:
\[y = \beta_0 + \beta_1X_1 + ... + \beta_{37}X_{37} + \epsilon\] where \(\beta_i\) represents … and \(X_i\) represents a covariate. There are more covariates than expected because each factor variable is recreated as a series of “dummy” variables. If a factor has \(n\) levels, then there will be \(n-1\) variables, and therefore parameters, corresponding to that covariate.
# Summary of the full main-effects linear model (all covariates, no interactions)
summary(model1)
##
## Call:
## lm(formula = G3 ~ ., data = student_d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5188 -1.8720 -0.0267 1.7499 6.7850
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.61066 3.05721 4.779 2.69e-06 ***
## schoolMS -0.64858 0.56699 -1.144 0.25353
## sexM 0.55609 0.35535 1.565 0.11860
## age -0.12640 0.14917 -0.847 0.39743
## addressU 0.39356 0.41203 0.955 0.34022
## famsizeLE3 0.27863 0.34494 0.808 0.41983
## PstatusT -0.22593 0.50392 -0.448 0.65421
## Medu 0.17537 0.22586 0.776 0.43805
## Fedu 0.14964 0.19497 0.767 0.44336
## Mjobhealth 1.12848 0.80364 1.404 0.16123
## Mjobother -0.58067 0.52629 -1.103 0.27072
## Mjobservices 0.83289 0.58827 1.416 0.15780
## Mjobteacher -0.72248 0.74703 -0.967 0.33421
## Fjobhealth -0.66808 1.01706 -0.657 0.51174
## Fjobother -0.67694 0.74744 -0.906 0.36579
## Fjobservices -0.79406 0.77542 -1.024 0.30659
## Fjobteacher 1.11130 0.94595 1.175 0.24095
## reasonhome 0.27837 0.39911 0.697 0.48601
## reasonother -0.03962 0.57521 -0.069 0.94514
## reasonreputation 0.10187 0.41203 0.247 0.80488
## traveltime 0.08056 0.24340 0.331 0.74087
## studytime 0.49671 0.20833 2.384 0.01770 *
## failures -0.83202 0.25772 -3.228 0.00137 **
## schoolsupyes -2.45423 0.46660 -5.260 2.65e-07 ***
## famsupyes -0.67436 0.34156 -1.974 0.04921 *
## paidyes -0.43441 0.33592 -1.293 0.19689
## activitiesyes 0.09041 0.32289 0.280 0.77965
## nurseryyes -0.30106 0.39882 -0.755 0.45087
## higheryes 0.46715 0.86024 0.543 0.58748
## internetyes 0.60066 0.44647 1.345 0.17947
## romanticyes -0.07390 0.34233 -0.216 0.82922
## famrel 0.10233 0.17818 0.574 0.56616
## freetime 0.12442 0.16765 0.742 0.45854
## goout -0.45074 0.16641 -2.709 0.00712 **
## Dalc 0.06612 0.23181 0.285 0.77564
## Walc -0.16801 0.17870 -0.940 0.34783
## health -0.23882 0.11388 -2.097 0.03678 *
## absences -0.40500 0.15112 -2.680 0.00774 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.786 on 319 degrees of freedom
## Multiple R-squared: 0.3324, Adjusted R-squared: 0.2549
## F-statistic: 4.292 on 37 and 319 DF, p-value: 3.505e-13
** ADJUSTED R2 of 0.256 BOO **
In order to determine the validity of this model, we must test the assumptions of linear regression. These assumptions include homogeneity of variances and gaussian errors.
# Studentized residuals of the full model; flag |r| > 2 as potential outliers.
# (`=` assignment replaced with `<-`; arrange(-ABS) replaced with the clearer desc().)
stud_resid <- rstudent(model1)
dat_resid <- data.frame(RESID = stud_resid, ABS = abs(stud_resid), PRED = model1$fitted.values)
dat_resid %>% arrange(desc(ABS)) %>% filter(ABS > 2)
## RESID ABS PRED
## 1 2.635034 2.635034 12.214983
## 2 2.519203 2.519203 11.335257
## 3 2.491389 2.491389 10.519023
## 4 -2.460927 2.460927 11.518753
## 5 2.399035 2.399035 12.656637
## 6 2.390195 2.390195 12.762561
## 7 2.385355 2.385355 11.623074
## 8 2.385337 2.385337 11.809618
## 9 -2.333870 2.333870 12.250495
## 10 2.287008 2.287008 12.051073
## 11 -2.267416 2.267416 12.054327
## 12 2.218734 2.218734 8.168950
## 13 2.135855 2.135855 7.458621
## 14 -2.095574 2.095574 11.358037
## 15 -2.089929 2.089929 11.402067
## 16 -2.088318 2.088318 11.569132
## 17 2.073456 2.073456 12.555000
## 18 -2.022951 2.022951 12.239201
## 19 -2.022002 2.022002 11.400394
Since this set of largest studentized residuals are all greater than 2, it may be of interest to examine them in further detail. ** this part might be better in the outlier detection section **
# Anderson-Darling test for normality of the studentized residuals
nortest::ad.test(dat_resid$RESID)
##
## Anderson-Darling normality test
##
## data: dat_resid$RESID
## A = 0.29561, p-value = 0.5935
From the code above, since the p-value 0.5935 is greater than \(\alpha = 0.05\), we fail to reject the null hypothesis that the residuals are normally distributed.
# Residual diagnostics side by side: histogram and residuals-vs-fitted
p1 <- ggplot(dat_resid) +
  geom_histogram(aes(RESID), bins = 25) +
  labs(x = "Studentized Residuals")
p2 <- ggplot(dat_resid) +
  geom_point(aes(PRED, RESID)) +
  geom_hline(aes(yintercept = 0)) +
  labs(x = "Predicted Values", y = "Studentized Residuals")
cowplot::plot_grid(p1, p2)
Since, from the collinearity analysis, several variables were significantly colinear with the intercept, we next try to build a model identical to the full model with the removal of the intercept.
# Same full model but with the intercept removed (`. - 1`), motivated by the
# dummy variables found collinear with the intercept in the eigenanalysis.
# NOTE(review): without an intercept, R-squared is computed about zero rather
# than the mean, so it is not comparable to the intercept model's R-squared.
model2 <- lm(G3 ~ . -1, data = student_d)
summary(model2)
##
## Call:
## lm(formula = G3 ~ . - 1, data = student_d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.761 -1.929 -0.042 1.828 7.146
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## schoolMS -1.24515 0.57164 -2.178 0.030121 *
## sexM 0.63578 0.36687 1.733 0.084064 .
## age 0.46305 0.08671 5.340 1.77e-07 ***
## addressU 0.67056 0.42162 1.590 0.112728
## famsizeLE3 0.35817 0.35610 1.006 0.315267
## PstatusT 0.07654 0.51671 0.148 0.882335
## Medu 0.29486 0.23201 1.271 0.204684
## Fedu 0.21016 0.20109 1.045 0.296774
## Mjobhealth 0.96155 0.82983 1.159 0.247425
## Mjobother -0.53027 0.54385 -0.975 0.330280
## Mjobservices 0.77578 0.60789 1.276 0.202814
## Mjobteacher -0.96861 0.77026 -1.257 0.209490
## Fjobhealth 0.41117 1.02495 0.401 0.688566
## Fjobother 0.24795 0.74618 0.332 0.739882
## Fjobservices 0.21367 0.77124 0.277 0.781920
## Fjobteacher 1.86877 0.96387 1.939 0.053403 .
## reasonhome 0.37596 0.41196 0.913 0.362140
## reasonother 0.32402 0.58929 0.550 0.582809
## reasonreputation 0.14274 0.42577 0.335 0.737653
## traveltime 0.29676 0.24719 1.201 0.230819
## studytime 0.56438 0.21483 2.627 0.009025 **
## failures -0.90236 0.26594 -3.393 0.000778 ***
## schoolsupyes -1.96735 0.47063 -4.180 3.76e-05 ***
## famsupyes -0.61231 0.35277 -1.736 0.083577 .
## paidyes -0.50016 0.34691 -1.442 0.150344
## activitiesyes 0.14108 0.33355 0.423 0.672603
## nurseryyes -0.06798 0.40911 -0.166 0.868137
## higheryes 1.95951 0.82846 2.365 0.018615 *
## internetyes 0.83239 0.45872 1.815 0.070527 .
## romanticyes -0.26294 0.35145 -0.748 0.454922
## famrel 0.13159 0.18405 0.715 0.475140
## freetime 0.28533 0.16975 1.681 0.093752 .
## goout -0.50623 0.17157 -2.951 0.003406 **
## Dalc -0.02395 0.23879 -0.100 0.920164
## Walc -0.10116 0.18413 -0.549 0.583111
## health -0.16548 0.11663 -1.419 0.156928
## absences -0.49462 0.15498 -3.191 0.001556 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.88 on 320 degrees of freedom
## Multiple R-squared: 0.9481, Adjusted R-squared: 0.9421
## F-statistic: 158 on 37 and 320 DF, p-value: < 2.2e-16
** ADJUSTED R2 OF .9421 !!!!!! **
# Studentized residuals of the no-intercept model; flag |r| > 2 as potential outliers.
# (`=` assignment replaced with `<-`; arrange(-ABS) replaced with desc().)
stud_resid <- rstudent(model2)
dat_resid <- data.frame(RESID = stud_resid, ABS = abs(stud_resid), PRED = model2$fitted.values)
dat_resid %>% arrange(desc(ABS)) %>% filter(ABS > 2)
## RESID ABS PRED
## 1 2.613736 2.613736 11.854497
## 2 2.501713 2.501713 12.334575
## 3 2.478232 2.478232 11.352717
## 4 -2.442490 2.442490 12.760977
## 5 -2.376990 2.376990 12.287373
## 6 2.336166 2.336166 11.601764
## 7 2.324132 2.324132 11.575175
## 8 -2.314948 2.314948 11.345418
## 9 2.290939 2.290939 10.829309
## 10 2.252149 2.252149 11.943310
## 11 2.221999 2.221999 7.963607
## 12 -2.119537 2.119537 11.856084
## 13 2.116726 2.116726 12.254707
## 14 -2.095148 2.095148 10.695254
## 15 2.077735 2.077735 12.329525
## 16 2.067167 2.067167 13.404662
## 17 2.056832 2.056832 7.481619
## 18 -2.050303 2.050303 15.462264
## 19 -2.020740 2.020740 11.400989
# Anderson-Darling normality test for the no-intercept model's residuals
nortest::ad.test(dat_resid$RESID)
##
## Anderson-Darling normality test
##
## data: dat_resid$RESID
## A = 0.18833, p-value = 0.9014
# Residual diagnostics for the no-intercept model
p1 <- ggplot(dat_resid) +
  geom_histogram(aes(RESID), bins = 25) +
  labs(x = "Studentized Residuals")
p2 <- ggplot(dat_resid) +
  geom_point(aes(PRED, RESID)) +
  geom_hline(aes(yintercept = 0)) +
  labs(x = "Predicted Values", y = "Studentized Residuals")
cowplot::plot_grid(p1, p2)
set.seed(12345)
# Backward elimination by AIC, starting from the full main-effects model.
# trace = FALSE spelled out (never the reassignable abbreviation F).
modelback <- step(model1, direction = "backward", trace = FALSE)
modelback
##
## Call:
## lm(formula = G3 ~ schoolMS + sexM + Mjobhealth + Mjobservices +
## Fjobteacher + studytime + failures + schoolsupyes + famsupyes +
## paidyes + internetyes + goout + health + absences, data = student_d)
##
## Coefficients:
## (Intercept) schoolMS sexM Mjobhealth Mjobservices
## 13.4364 -0.8717 0.5457 1.8393 1.4309
## Fjobteacher studytime failures schoolsupyes famsupyes
## 2.0964 0.4958 -1.0237 -2.2893 -0.5501
## paidyes internetyes goout health absences
## -0.4545 0.7021 -0.4796 -0.2657 -0.4324
# Bidirectional (both-direction) stepwise selection by AIC from the full model.
# trace = FALSE spelled out (never the reassignable abbreviation F).
modelboth <- step(model1, direction = "both", trace = FALSE)
modelboth
##
## Call:
## lm(formula = G3 ~ schoolMS + sexM + Mjobhealth + Mjobservices +
## Fjobteacher + studytime + failures + schoolsupyes + famsupyes +
## paidyes + internetyes + goout + health + absences, data = student_d)
##
## Coefficients:
## (Intercept) schoolMS sexM Mjobhealth Mjobservices
## 13.4364 -0.8717 0.5457 1.8393 1.4309
## Fjobteacher studytime failures schoolsupyes famsupyes
## 2.0964 0.4958 -1.0237 -2.2893 -0.5501
## paidyes internetyes goout health absences
## -0.4545 0.7021 -0.4796 -0.2657 -0.4324
# Forward selection by AIC. NOTE(review): starting from the full model with no
# larger scope, forward selection has nothing to add, so it simply returns the
# full model — this explains "Forward gives all the variables" below. To make
# it meaningful, start from an intercept-only model with scope = formula(model1).
# trace = FALSE spelled out (never the reassignable abbreviation F).
modelforward <- step(model1, direction = "forward", trace = FALSE)
modelforward
##
## Call:
## lm(formula = G3 ~ schoolMS + sexM + age + addressU + famsizeLE3 +
## PstatusT + Medu + Fedu + Mjobhealth + Mjobother + Mjobservices +
## Mjobteacher + Fjobhealth + Fjobother + Fjobservices + Fjobteacher +
## reasonhome + reasonother + reasonreputation + traveltime +
## studytime + failures + schoolsupyes + famsupyes + paidyes +
## activitiesyes + nurseryyes + higheryes + internetyes + romanticyes +
## famrel + freetime + goout + Dalc + Walc + health + absences,
## data = student_d)
##
## Coefficients:
## (Intercept) schoolMS sexM age
## 14.61066 -0.64858 0.55609 -0.12640
## addressU famsizeLE3 PstatusT Medu
## 0.39356 0.27863 -0.22593 0.17537
## Fedu Mjobhealth Mjobother Mjobservices
## 0.14964 1.12848 -0.58067 0.83289
## Mjobteacher Fjobhealth Fjobother Fjobservices
## -0.72248 -0.66808 -0.67694 -0.79406
## Fjobteacher reasonhome reasonother reasonreputation
## 1.11130 0.27837 -0.03962 0.10187
## traveltime studytime failures schoolsupyes
## 0.08056 0.49671 -0.83202 -2.45423
## famsupyes paidyes activitiesyes nurseryyes
## -0.67436 -0.43441 0.09041 -0.30106
## higheryes internetyes romanticyes famrel
## 0.46715 0.60066 -0.07390 0.10233
## freetime goout Dalc Walc
## 0.12442 -0.45074 0.06612 -0.16801
## health absences
## -0.23882 -0.40500
Backward and Both give the same variables.
Forward gives all the variables.
DOES NOT give same exact answer if we use model1 with intercept or model2 without to do the stepwise process.
# Summary of the backward-elimination model (14 covariates)
summary(modelback)
##
## Call:
## lm(formula = G3 ~ schoolMS + sexM + Mjobhealth + Mjobservices +
## Fjobteacher + studytime + failures + schoolsupyes + famsupyes +
## paidyes + internetyes + goout + health + absences, data = student_d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2622 -1.9473 -0.0014 1.8415 7.2137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.4364 0.8452 15.898 < 2e-16 ***
## schoolMS -0.8717 0.4727 -1.844 0.066026 .
## sexM 0.5457 0.3172 1.720 0.086296 .
## Mjobhealth 1.8393 0.5304 3.467 0.000593 ***
## Mjobservices 1.4309 0.3429 4.173 3.82e-05 ***
## Fjobteacher 2.0964 0.5664 3.701 0.000250 ***
## studytime 0.4958 0.1895 2.616 0.009286 **
## failures -1.0237 0.2306 -4.440 1.22e-05 ***
## schoolsupyes -2.2893 0.4354 -5.258 2.57e-07 ***
## famsupyes -0.5501 0.3240 -1.698 0.090440 .
## paidyes -0.4545 0.3122 -1.456 0.146276
## internetyes 0.7021 0.4118 1.705 0.089117 .
## goout -0.4796 0.1363 -3.519 0.000491 ***
## health -0.2657 0.1061 -2.504 0.012742 *
## absences -0.4324 0.1331 -3.249 0.001274 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.738 on 342 degrees of freedom
## Multiple R-squared: 0.3088, Adjusted R-squared: 0.2805
## F-statistic: 10.92 on 14 and 342 DF, p-value: < 2.2e-16
This model so far gives us the best R2 adjusted and the most significant variables.
# Studentized residuals of the backward-selected model; flag |r| > 2.
# (`=` assignment replaced with `<-`; arrange(-ABS) replaced with desc().)
stud_resid <- rstudent(modelback)
dat_resid <- data.frame(RESID = stud_resid, ABS = abs(stud_resid), PRED = modelback$fitted.values)
dat_resid %>% arrange(desc(ABS)) %>% filter(ABS > 2)
## RESID ABS PRED
## 1 2.679317 2.679317 11.786311
## 2 2.589617 2.589617 12.137775
## 3 2.561374 2.561374 12.126561
## 4 2.354490 2.354490 10.710387
## 5 -2.323288 2.323288 12.262156
## 6 2.311762 2.311762 11.976174
## 7 2.294170 2.294170 11.903973
## 8 2.293555 2.293555 11.813598
## 9 -2.243988 2.243988 11.046976
## 10 -2.194720 2.194720 11.911696
## 11 2.092021 2.092021 12.496235
## 12 2.082325 2.082325 8.422516
## 13 2.073858 2.073858 12.514650
## 14 -2.029878 2.029878 10.469839
## 15 -2.018927 2.018927 14.362543
# Anderson-Darling normality test for the backward-selected model's residuals
nortest::ad.test(dat_resid$RESID)
##
## Anderson-Darling normality test
##
## data: dat_resid$RESID
## A = 0.52166, p-value = 0.1834
# Residual diagnostics for the backward-selected model
p1 <- ggplot(dat_resid) +
  geom_histogram(aes(RESID), bins = 25) +
  labs(x = "Studentized Residuals")
p2 <- ggplot(dat_resid) +
  geom_point(aes(PRED, RESID)) +
  geom_hline(aes(yintercept = 0)) +
  labs(x = "Predicted Values", y = "Studentized Residuals")
cowplot::plot_grid(p1, p2)
Of all possible interaction terms, several were significant. Among those, the most “interpretable” are studytime:schoolsupyes, studytime:famsupyes, studytime:goout, and schoolsupyes:goout. Interestingly, it appears that the effects of studytime are reduced when one goes out a lot; perhaps fatigue reduces the effectiveness of studying.
# Backward-selected model augmented with the four most interpretable
# significant interaction terms
model.int <- lm(
  G3 ~ schoolMS + sexM + Mjobhealth + Mjobservices + Fjobteacher + studytime +
    failures + schoolsupyes + famsupyes + paidyes + internetyes + goout +
    health + absences + studytime:schoolsupyes + studytime:famsupyes +
    studytime:goout + schoolsupyes:goout,
  data = student_d
)
summary(model.int)
##
## Call:
## lm(formula = G3 ~ schoolMS + sexM + Mjobhealth + Mjobservices +
## Fjobteacher + studytime + failures + schoolsupyes + famsupyes +
## paidyes + internetyes + goout + health + absences + studytime:schoolsupyes +
## studytime:famsupyes + studytime:goout + schoolsupyes:goout,
## data = student_d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.266 -1.836 0.025 1.867 7.307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.2083 1.3897 8.065 1.29e-14 ***
## schoolMS -0.8319 0.4676 -1.779 0.076166 .
## sexM 0.5296 0.3131 1.692 0.091626 .
## Mjobhealth 1.7611 0.5262 3.347 0.000909 ***
## Mjobservices 1.3857 0.3399 4.076 5.71e-05 ***
## Fjobteacher 2.0753 0.5598 3.707 0.000245 ***
## studytime 1.4476 0.5834 2.481 0.013571 *
## failures -1.1339 0.2322 -4.882 1.62e-06 ***
## schoolsupyes 2.9375 1.6735 1.755 0.080109 .
## famsupyes -1.5010 0.8036 -1.868 0.062636 .
## paidyes -0.4000 0.3093 -1.293 0.196759
## internetyes 0.5941 0.4080 1.456 0.146287
## goout 0.2966 0.3555 0.834 0.404694
## health -0.2439 0.1050 -2.323 0.020777 *
## absences -0.3891 0.1323 -2.942 0.003488 **
## studytime:schoolsupyes -1.4596 0.5319 -2.744 0.006388 **
## studytime:famsupyes 0.4570 0.3738 1.223 0.222253
## studytime:goout -0.3478 0.1648 -2.111 0.035525 *
## schoolsupyes:goout -0.7078 0.3592 -1.970 0.049620 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.7 on 338 degrees of freedom
## Multiple R-squared: 0.3358, Adjusted R-squared: 0.3004
## F-statistic: 9.494 on 18 and 338 DF, p-value: < 2.2e-16
# Studentized residuals of the interaction model; flag |r| > 2.
# (`=` assignment replaced with `<-`; arrange(-ABS) replaced with desc().)
stud_resid <- rstudent(model.int)
dat_resid <- data.frame(RESID = stud_resid, ABS = abs(stud_resid), PRED = model.int$fitted.values)
dat_resid %>% arrange(desc(ABS)) %>% filter(ABS > 2)
## RESID ABS PRED
## 1 2.822477 2.822477 11.692634
## 2 2.748930 2.748930 11.778145
## 3 2.598079 2.598079 12.108456
## 4 2.388207 2.388207 11.894930
## 5 -2.359367 2.359367 12.266122
## 6 2.342987 2.342987 10.833667
## 7 2.342512 2.342512 11.864914
## 8 2.313796 2.313796 7.936474
## 9 2.310611 2.310611 11.872062
## 10 -2.263963 2.263963 11.014384
## 11 -2.231969 2.231969 10.750802
## 12 -2.181129 2.181129 11.792550
## 13 -2.023639 2.023639 12.367534
# Anderson-Darling normality test for the interaction model's residuals
nortest::ad.test(dat_resid$RESID)
##
## Anderson-Darling normality test
##
## data: dat_resid$RESID
## A = 0.43554, p-value = 0.2974
Normality assumption is justified for model.int
# Residual diagnostics for the interaction model
p1 <- ggplot(dat_resid) +
  geom_histogram(aes(RESID), bins = 25) +
  labs(x = "Studentized Residuals")
p2 <- ggplot(dat_resid) +
  geom_point(aes(PRED, RESID)) +
  geom_hline(aes(yintercept = 0)) +
  labs(x = "Predicted Values", y = "Studentized Residuals")
cowplot::plot_grid(p1, p2)
# Parsimonious model: only the strongly significant main effects
modelsmall <- lm(
  G3 ~ Mjobhealth + Mjobservices + Fjobteacher + failures + schoolsupyes + goout,
  data = student_d
)
summary(modelsmall)
##
## Call:
## lm(formula = G3 ~ Mjobhealth + Mjobservices + Fjobteacher + failures +
## schoolsupyes + goout, data = student_d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5735 -1.8793 -0.0725 1.9254 7.9254
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.0704 0.4698 27.819 < 2e-16 ***
## Mjobhealth 1.7801 0.5422 3.283 0.001130 **
## Mjobservices 1.3399 0.3522 3.804 0.000168 ***
## Fjobteacher 2.0188 0.5860 3.445 0.000640 ***
## failures -1.2676 0.2301 -5.508 7.05e-08 ***
## schoolsupyes -2.2548 0.4384 -5.143 4.51e-07 ***
## goout -0.4989 0.1405 -3.552 0.000435 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.855 on 350 degrees of freedom
## Multiple R-squared: 0.231, Adjusted R-squared: 0.2178
## F-statistic: 17.52 on 6 and 350 DF, p-value: < 2.2e-16
modelsmall satisfies linearity and homogeneity assumptions.
# Studentized residuals of the small model; flag |r| > 2.
# (`=` assignment replaced with `<-`; arrange(-ABS) replaced with desc().)
stud_resid_small <- rstudent(modelsmall)
dat_resid_small <- data.frame(RESID = stud_resid_small, ABS = abs(stud_resid_small), PRED = modelsmall$fitted.values)
dat_resid_small %>% arrange(desc(ABS)) %>% filter(ABS > 2)
## RESID ABS PRED
## 1 2.814574 2.814574 11.07460
## 2 2.630448 2.630448 11.57355
## 3 2.453589 2.453589 12.07249
## 4 -2.334139 2.334139 12.57143
## 5 -2.323367 2.323367 11.57355
## 6 2.281179 2.281179 10.57566
## 7 -2.277173 2.277173 12.41453
## 8 2.270601 2.270601 11.57355
## 9 2.270601 2.270601 11.57355
## 10 2.205028 2.205028 13.85261
## 11 2.166041 2.166041 11.91558
## 12 -2.146474 2.146474 12.07249
## 13 -2.146474 2.146474 12.07249
# Anderson-Darling normality test for the small model's residuals
nortest::ad.test(dat_resid_small$RESID)
##
## Anderson-Darling normality test
##
## data: dat_resid_small$RESID
## A = 0.3526, p-value = 0.4646
Small model satisfies conditions for Gaussian assumption.
# Residual diagnostics for the small model
p1.small <- ggplot(dat_resid_small) +
  geom_histogram(aes(RESID), bins = 25) +
  labs(x = "Studentized Residuals")
p2.small <- ggplot(dat_resid_small) +
  geom_point(aes(PRED, RESID)) +
  geom_hline(aes(yintercept = 0)) +
  labs(x = "Predicted Values", y = "Studentized Residuals")
cowplot::plot_grid(p1.small, p2.small)
The small model also appears to satisfy the diagnostic checks for homogeneity and linearity. Existence is trivial, but we may want to discuss limitations of the independence assumption (maybe kids form study groups or grades are based on a curve).
# Small model plus the studytime x school-support interaction.
# The original used `studytime*schoolsupyes`, which expands to the two main
# effects (already listed) plus the interaction; `studytime:schoolsupyes` is
# the equivalent, non-redundant spelling and fits an identical model.
smallint <- lm(G3 ~ Mjobhealth + Mjobservices + Fjobteacher + studytime + failures +
                 schoolsupyes + absences + studytime:schoolsupyes, data = student_d)
summary(smallint)
##
## Call:
## lm(formula = G3 ~ Mjobhealth + Mjobservices + Fjobteacher + studytime +
## failures + schoolsupyes + absences + studytime * schoolsupyes,
## data = student_d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3169 -1.8571 -0.0307 1.9693 8.5644
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.1451 0.5269 21.153 < 2e-16 ***
## Mjobhealth 1.5611 0.5367 2.909 0.003861 **
## Mjobservices 1.3264 0.3497 3.793 0.000176 ***
## Fjobteacher 2.0558 0.5830 3.526 0.000478 ***
## studytime 0.5550 0.1959 2.833 0.004884 **
## failures -1.1399 0.2328 -4.896 1.5e-06 ***
## schoolsupyes 0.1176 1.2027 0.098 0.922180
## absences -0.4215 0.1331 -3.168 0.001672 **
## studytime:schoolsupyes -1.1585 0.5398 -2.146 0.032571 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.83 on 348 degrees of freedom
## Multiple R-squared: 0.2484, Adjusted R-squared: 0.2312
## F-statistic: 14.38 on 8 and 348 DF, p-value: < 2.2e-16
# Studentized residuals of the small interaction model; flag |r| > 2.
# (`=` assignment replaced with `<-`; arrange(-ABS) replaced with desc().)
stud_resid <- rstudent(smallint)
dat_resid <- data.frame(RESID = stud_resid, ABS = abs(stud_resid), PRED = smallint$fitted.values)
dat_resid %>% arrange(desc(ABS)) %>% filter(ABS > 2)
## RESID ABS PRED
## 1 3.084691 3.084691 10.43556
## 2 2.558145 2.558145 10.85706
## 3 2.515938 2.515938 11.96703
## 4 2.363309 2.363309 12.38852
## 5 2.362362 2.362362 11.47392
## 6 2.273524 2.273524 11.83483
## 7 -2.262882 2.262882 12.31691
## 8 2.213032 2.213032 12.91281
## 9 2.143890 2.143890 10.99055
## 10 -2.137058 2.137058 10.99055
## 11 -2.105066 2.105066 10.72103
## 12 2.088324 2.088324 13.15990
## 13 -2.077790 2.077790 11.83354
## 14 -2.077790 2.077790 11.83354
## 15 -2.077790 2.077790 11.83354
## 16 -2.077790 2.077790 11.83354
## 17 2.001355 2.001355 12.38852
## 18 2.000896 2.000896 14.50464
# Anderson-Darling test of normality on the studentized residuals.
# p-value = 0.17 > 0.05, so we fail to reject normality — the normality
# assumption for the reduced interaction model looks reasonable.
nortest::ad.test(dat_resid$RESID)
##
## Anderson-Darling normality test
##
## data: dat_resid$RESID
## A = 0.53226, p-value = 0.1725
# Side-by-side residual diagnostics: a histogram of the studentized
# residuals (normality check) and a residual-vs-fitted scatter with a zero
# reference line (homoscedasticity / linearity check).
p1 <- ggplot(data = dat_resid) +
  geom_histogram(aes(RESID), bins = 25) +
  labs(x = "Studentized Residuals")
p2 <- ggplot(data = dat_resid) +
  geom_point(aes(PRED, RESID)) +
  geom_hline(aes(yintercept = 0)) +
  labs(x = "Predicted Values", y = "Studentized Residuals")
cowplot::plot_grid(p1, p2)
# "Best subset" model (Model 6): job dummies, past failures, school support,
# going out, health, and absences — no interaction terms. Fit on student_d
# (the subset with the G3 == 0 records removed).
model6 <- lm(G3 ~ Mjobhealth + Mjobservices + Fjobteacher + failures + schoolsupyes + goout + health + absences, data = student_d)
summary(model6)
##
## Call:
## lm(formula = G3 ~ Mjobhealth + Mjobservices + Fjobteacher + failures +
## schoolsupyes + goout + health + absences, data = student_d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.4202 -1.9905 -0.1228 1.9217 7.3935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.6123 0.6314 23.144 < 2e-16 ***
## Mjobhealth 1.7919 0.5345 3.352 0.000890 ***
## Mjobservices 1.4685 0.3475 4.225 3.05e-05 ***
## Fjobteacher 2.0296 0.5748 3.531 0.000470 ***
## failures -1.1001 0.2304 -4.775 2.65e-06 ***
## schoolsupyes -2.3415 0.4306 -5.438 1.02e-07 ***
## goout -0.4730 0.1382 -3.421 0.000697 ***
## health -0.2581 0.1069 -2.415 0.016248 *
## absences -0.4285 0.1318 -3.250 0.001265 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.8 on 348 degrees of freedom
## Multiple R-squared: 0.2643, Adjusted R-squared: 0.2473
## F-statistic: 15.62 on 8 and 348 DF, p-value: < 2.2e-16
**BB: keep this, or should we not use results from the subset analysis?**
# Externally studentized residuals for the best subset model (model6).
# Observations with |residual| > 2 are listed as potential outliers.
# (Fixed: use '<-' for assignment rather than '=', per R style convention.)
stud_resid <- rstudent(model6)
dat_resid <- data.frame(
  RESID = stud_resid,            # studentized residual
  ABS = abs(stud_resid),         # magnitude, used for ranking
  PRED = model6$fitted.values    # fitted value, for residual-vs-fit plot
)
dat_resid %>% arrange(-ABS) %>% filter(ABS > 2)
## RESID ABS PRED
## 1 2.679809 2.679809 11.60650
## 2 2.595935 2.595935 11.82016
## 3 2.529514 2.529514 12.03383
## 4 2.415042 2.415042 11.30389
## 5 -2.329295 2.329295 12.42020
## 6 2.291485 2.291485 10.70386
## 7 2.259156 2.259156 11.73239
## 8 -2.212688 2.212688 11.13353
## 9 -2.157793 2.157793 11.99053
## 10 -2.085166 2.085166 11.77569
## 11 2.076754 2.076754 13.33347
## 12 -2.073113 2.073113 10.70386
## 13 2.066341 2.066341 12.38753
## 14 2.064937 2.064937 12.45968
## 15 -2.033755 2.033755 11.61277
## 16 2.024998 2.024998 12.47611
## 17 2.009772 2.009772 14.51357
# Anderson-Darling normality test on model6's studentized residuals.
# p-value = 0.16 > 0.05: fail to reject normality for this model as well.
nortest::ad.test(dat_resid$RESID)
##
## Anderson-Darling normality test
##
## data: dat_resid$RESID
## A = 0.54719, p-value = 0.1582
# Residual diagnostics for model6, displayed side by side: histogram of
# studentized residuals (normality) and residuals against fitted values
# with a zero reference line (homoscedasticity / linearity).
p1 <- ggplot(data = dat_resid) +
  geom_histogram(aes(RESID), bins = 25) +
  labs(x = "Studentized Residuals")
p2 <- ggplot(data = dat_resid) +
  geom_point(aes(PRED, RESID)) +
  geom_hline(aes(yintercept = 0)) +
  labs(x = "Predicted Values", y = "Studentized Residuals")
cowplot::plot_grid(p1, p2)
# Sensitivity check: refit the best subset specification on student_d_all,
# which retains the G3 == 0 records that were dropped from student_d.
# Comparing against model6 shows how much the exclusion drives the results
# (e.g., absences changes sign and several terms lose significance).
model6_all <- lm(G3 ~ Mjobhealth + Mjobservices + Fjobteacher + failures + schoolsupyes + goout + health + absences, data = student_d_all)
summary(model6_all)
##
## Call:
## lm(formula = G3 ~ Mjobhealth + Mjobservices + Fjobteacher + failures +
## schoolsupyes + goout + health + absences, data = student_d_all)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.1619 -1.7970 0.4408 2.7480 8.4881
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.1885 0.8886 13.716 < 2e-16 ***
## Mjobhealth 2.2767 0.7659 2.972 0.00314 **
## Mjobservices 1.5218 0.4909 3.100 0.00208 **
## Fjobteacher 1.4156 0.8087 1.751 0.08083 .
## failures -2.1902 0.2891 -7.576 2.66e-13 ***
## schoolsupyes -1.2548 0.6285 -1.996 0.04659 *
## goout -0.4418 0.1910 -2.313 0.02123 *
## health -0.1973 0.1523 -1.295 0.19609
## absences 0.2879 0.1885 1.527 0.12755
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.164 on 386 degrees of freedom
## Multiple R-squared: 0.1907, Adjusted R-squared: 0.1739
## F-statistic: 11.37 on 8 and 386 DF, p-value: 1.699e-14
Influence – Cook’s Distance: values near 1 indicate influential observations.
Outliers – Leverage values: values greater than 2*p/n indicate high-leverage points.
# Leverage and influence diagnostics for the stepwise model (modelback).
p <- length(coef(modelback))   # number of estimated parameters
n <- nrow(student_d)           # number of observations
levcutoff <- 2 * p / n         # rule-of-thumb leverage cutoff (2p/n)
lev <- hatvalues(modelback)
cook <- cooks.distance(modelback)
# Previously the cutoffs were computed but never applied; actually flag the
# observations the rules of thumb identify so they can be inspected.
high_leverage <- which(lev > levcutoff)   # high-leverage points (> 2p/n)
influential <- which(cook > 1)            # influential points (Cook's D near/above 1)
Take out the values that are outliers and run the regression again?
Aside from looking at the Adjusted R², we can look at the AIC or SBC/BIC (smaller is better in both cases). More/different models can be added to this code, but here is the basis.
# Model comparison table: AIC, BIC (smaller is better), and adjusted R^2
# (larger is better) for each candidate model.
# Fixed: 'summary(m)$adj.r.sq' relied on fragile partial matching of the
# 'adj.r.squared' component — spelled out in full below. The repeated
# per-model calls are replaced with a named list and vapply().
models <- list(
  "Model 1: Full Model" = model1,
  "Model 2: No Intercept" = model2,
  "Model 3: Stepwise Model" = modelback,
  "Model 4: Stepwise Model + Interaction Terms" = model.int,
  "Model 5: Reduced Stepwise Model" = modelsmall,
  "Model 6: Best Subset Model" = model6,
  "Model 7: Reduced Interaction Model" = smallint
)
aic <- vapply(models, AIC, numeric(1))
bic <- vapply(models, BIC, numeric(1))
adjR <- vapply(models, function(m) summary(m)$adj.r.squared, numeric(1))
selection <- data.frame(
  model = names(models),
  AICs = aic,
  BICs = bic,
  Adjusted.R.Squared = adjR,
  row.names = NULL   # keep default 1..n row names, as before
)
selection %>% knitr::kable(align = c("c", "c"))
| model | AICs | BICs | Adjusted.R.Squared |
|---|---|---|---|
| Model 1: Full Model | 1782.550 | 1933.781 | 0.2549348 |
| Model 2: No Intercept | 1805.236 | 1952.590 | 0.9420873 |
| Model 3: Stepwise Model | 1748.918 | 1810.962 | 0.2805423 |
| Model 4: Stepwise Model + Interaction Terms | 1742.705 | 1820.259 | 0.3004419 |
| Model 5: Reduced Stepwise Model | 1771.009 | 1802.031 | 0.2178292 |
| Model 6: Best Subset Model | 1759.233 | 1798.011 | 0.2473398 |
| Model 7: Reduced Interaction Model | 1766.823 | 1805.600 | 0.2311681 |
[1] P. Cortez and A. Silva, “Using Data Mining to Predict Secondary School Student Performance,” Student Performance Data Set (student-mat.csv), UCI Machine Learning Repository, 2008.